Q1
Load the data frame tut1 into R from the file tut1.RData available on the course page. The data frame contains the variables x and y.
Code
library (here)
library (texreg)
library (rms)
library (kableExtra)
library (dplyr)
library (ggplot2)
options (prType = "latex" )
options (prType= 'html' )
load (here ("data" ,'tut1.rdata' ))
knitr:: opts_chunk$ set (warning = FALSE ,
message = FALSE )
theme_set (
theme_bw (base_size = 18 ) +
theme (legend.position = "bottom" )
)
Fit a simple linear regression with y as the response and x as the predictor
fit0 <- ols (y ~ x, data = tut1)
or you can use glm (generalized linear model) or lm (linear model)
fit_glm <- glm (y~ x, data= tut1)
fit_lm <- lm (y~ x, data = tut1)
The results are identical!
Code
htmlreg (list (fit0, fit_lm,fit_glm),
ci.force = TRUE ,
#custom.coef.map = keepvars,
custom.model.names = c ("ols" , "lm" ,"glm" ))
Statistical models
ols
lm
glm
Intercept
1.22*
[ 0.80; 1.64]
x
-0.36*
-0.36*
-0.36*
[-0.48; -0.24]
[-0.48; -0.24]
[-0.48; -0.24]
(Intercept)
1.22*
1.22*
[ 0.80; 1.64]
[ 0.80; 1.64]
Num. obs.
100
100
100
R2
0.27
0.27
Adj. R2
0.27
0.27
L.R.
32.07
AIC
303.66
BIC
311.48
Log Likelihood
-148.83
Deviance
114.89
* Null hypothesis value outside the confidence interval.
Diagnostic plots for model checking
If the model is correctly specified, and all assumptions (known as LINE ) are met, the residuals should be randomly scattered around zero
pred_glm <- tut1 %>%
mutate (
yobs = y,
yhat = predict (fit_glm),
resid = y - yhat)
x_y_plot <- ggplot (pred_glm, aes (x,yobs) )+
geom_point ()+
geom_smooth ()
x_resid_plot <- ggplot (pred_glm, aes (x,resid) )+
geom_point ()+
geom_smooth ()
yhat_resid_plot <- ggplot (pred_glm, aes (yhat,resid) )+
geom_point ()+
geom_smooth ()
x_y_plot
x_resid_plot
yhat_resid_plot
predicted result using restricted cubic spline (skyblue) versus simple linear (black)
fit_rcs <- ols (y ~ rcs (x, 3 ), data = tut1)
pred_rcs <- tut1 %>%
mutate (
pred_rcs = predict (fit_rcs,data = tut1)
)
pred_rcs %>% ggplot ( aes (x = x, y = pred_rcs))+
# geom_point(size = 0.01)+
geom_line (color = "skyblue" )+
# geom_point(data =pred_glm, aes(x =x, y = yhat) ,size = 0.01, ) +
geom_line (data = pred_glm, aes (x = x, y = yhat),color = "black" ,alpha = 0.7 )
Check the diagnostic plots again (knots = 3)
pred_rcs <- tut1 %>%
mutate (
yobs = y,
yhat = predict (fit_rcs),
resid = y - yhat)
x_y_plot <- ggplot (pred_rcs, aes (x,yobs) )+
geom_point ()+
geom_smooth ()
x_resid_plot <- ggplot (pred_rcs, aes (x,resid) )+
geom_point ()+
geom_smooth ()
yhat_resid_plot <- ggplot (pred_rcs, aes (yhat,resid) )+
geom_point ()+
geom_smooth ()
x_y_plot
x_resid_plot
yhat_resid_plot
experiment with different knot values:
Code
fit_rcs <- ols (y ~ rcs (x, 5 ), data = tut1)
pred_rcs <- tut1 %>%
mutate (
yobs = y,
yhat = predict (fit_rcs),
resid = y - yhat)
x_y_plot <- ggplot (pred_rcs, aes (x,yobs) )+
geom_point ()+
geom_smooth ()
x_resid_plot <- ggplot (pred_rcs, aes (x,resid) )+
geom_point ()+
geom_smooth ()
yhat_resid_plot <- ggplot (pred_rcs, aes (yhat,resid) )+
geom_point ()+
geom_smooth ()
x_y_plot
x_resid_plot
yhat_resid_plot
Code
fit_rcs <- ols (y ~ rcs (x, 5 ), data = tut1)
pred_rcs <- tut1 %>%
mutate (
yobs = y,
yhat = predict (fit_rcs),
resid = y - yhat)
x_y_plot <- ggplot (pred_rcs, aes (x,yobs) )+
geom_point ()+
geom_smooth ()
x_resid_plot <- ggplot (pred_rcs, aes (x,resid) )+
geom_point ()+
geom_smooth ()
yhat_resid_plot <- ggplot (pred_rcs, aes (yhat,resid) )+
geom_point ()+
geom_smooth ()
x_y_plot
x_resid_plot
yhat_resid_plot
Conclusion: knots = 4 is good enough to fit the data! knots = 5 may lead to overfitting .
Q2
Q2 is a similar question but with more than one predictor!
Load the data frame FEV from the file FEV.RData. For these data, use the variable fev as the response and the rest as the explanatory covariates.
load (here ('data' ,'FEV.rdata' ))
FEV %>%
head () %>%
kable ()
301
9
1.708
57.0
female
non-current smoker
451
8
1.724
67.5
female
non-current smoker
501
7
1.720
54.5
female
non-current smoker
642
9
1.558
53.0
male
non-current smoker
901
9
1.895
57.0
male
non-current smoker
1701
8
2.336
61.0
female
non-current smoker
Fit an additive linear model to fev using the other variables as the covariates. Evaluate whether any of the continuous variables should be fit as non-linear terms.
fit the linear model
fev_lm <- ols (fev ~ age + height + sex + smoke,
data = FEV)
pred_fev <- FEV %>%
mutate (
yobs = fev,
yhat = predict (fev_lm),
resid = yobs - yhat)
age_resid_plot <- ggplot (pred_fev, aes (age,resid) )+
geom_point ()+
geom_smooth ()
height_resid_plot <- ggplot (pred_fev, aes (height,resid) )+
geom_point ()+
geom_smooth ()
age_resid_plot
height_resid_plot
Fit RCS:
fev_rcs <- ols (fev ~ rcs (age, 4 ) + rcs (height, 4 ) + sex + smoke,
data = FEV)
Again, check the diagnostic plots:
pred_fev <- FEV %>%
mutate (
yobs = fev,
yhat = predict (fev_rcs),
resid = yobs - yhat)
age_y_plot <- ggplot (pred_fev, aes (age,yobs) )+
geom_point ()+
geom_smooth ()
age_resid_plot <- ggplot (pred_fev, aes (age,resid) )+
geom_point ()+
geom_smooth ()
yhat_resid_plot <- ggplot (pred_fev, aes (yhat,resid) )+
geom_point ()+
geom_smooth ()
age_y_plot
age_resid_plot
yhat_resid_plot
height_y_plot <- ggplot (pred_fev, aes (height,yobs) )+
geom_point ()+
geom_smooth ()
height_resid_plot <- ggplot (pred_fev, aes (height,resid) )+
geom_point ()+
geom_smooth ()
yhat_resid_plot <- ggplot (pred_fev, aes (yhat,resid) )+
geom_point ()+
geom_smooth ()
height_y_plot
height_resid_plot
yhat_resid_plot